// Copyright 2020 Robert Balas
// SPDX-License-Identifier: SHL-0.51

// simple minimal rv32i core
//
// inspired by Computer Architecture: A Quantitative Approach p. C-30,
// darkriscv.v, RI5CY and riscv-sodor
//
// We really don't care about modularization.
// CPI and longest path is just a number.
//
// Author: Robert Balas (balasr@iis.ee.ethz.ch)

module catv_riscv #(
  parameter logic [31:0] BOOT_ADDR = 32'h0000_2000
) (
  input logic         clk_i,
  input logic         rst_ni,

  // instruction fetch port
  output logic [31:0] insn_addr_o,
  output logic        insn_valid_o,
  input logic         insn_ready_i,
  input logic [31:0]  insn_data_i,
  input logic         insn_rvalid_i,

  // data interface
  output logic [31:0] data_addr_o,
  output logic        data_wen_o,
  output logic [31:0] data_wdata_o,
  input logic         data_rvalid_i,
  input logic [31:0]  data_rdata_i,
  output logic [3:0]  data_strb_o,
  output logic        data_valid_o,
  input logic         data_ready_i,

  // configuration
  input logic [19:0]  hartid_i
);

  // Instruction match patterns, one 32-bit pattern per mnemonic.
  // Automatically generated by parse_opcodes
  //
  // A '?' in a binary literal is a high-impedance (z) bit. The decoder in this
  // module compares insn_data against these patterns with casez, which treats
  // such bits as don't-care, so only the fixed 0/1 bits take part in a match.
  //
  // rv32i
  localparam [31:0] BEQ                = 32'b?????????????????000?????1100011;
  localparam [31:0] BNE                = 32'b?????????????????001?????1100011;
  localparam [31:0] BLT                = 32'b?????????????????100?????1100011;
  localparam [31:0] BGE                = 32'b?????????????????101?????1100011;
  localparam [31:0] BLTU               = 32'b?????????????????110?????1100011;
  localparam [31:0] BGEU               = 32'b?????????????????111?????1100011;
  localparam [31:0] JALR               = 32'b?????????????????000?????1100111;
  localparam [31:0] JAL                = 32'b?????????????????????????1101111;
  localparam [31:0] LUI                = 32'b?????????????????????????0110111;
  localparam [31:0] AUIPC              = 32'b?????????????????????????0010111;
  localparam [31:0] ADDI               = 32'b?????????????????000?????0010011;
  localparam [31:0] SLLI               = 32'b000000???????????001?????0010011;
  localparam [31:0] SLTI               = 32'b?????????????????010?????0010011;
  localparam [31:0] SLTIU              = 32'b?????????????????011?????0010011;
  localparam [31:0] XORI               = 32'b?????????????????100?????0010011;
  localparam [31:0] SRLI               = 32'b000000???????????101?????0010011;
  localparam [31:0] SRAI               = 32'b010000???????????101?????0010011;
  localparam [31:0] ORI                = 32'b?????????????????110?????0010011;
  localparam [31:0] ANDI               = 32'b?????????????????111?????0010011;
  localparam [31:0] ADD                = 32'b0000000??????????000?????0110011;
  localparam [31:0] SUB                = 32'b0100000??????????000?????0110011;
  localparam [31:0] SLL                = 32'b0000000??????????001?????0110011;
  localparam [31:0] SLT                = 32'b0000000??????????010?????0110011;
  localparam [31:0] SLTU               = 32'b0000000??????????011?????0110011;
  localparam [31:0] XOR                = 32'b0000000??????????100?????0110011;
  localparam [31:0] SRL                = 32'b0000000??????????101?????0110011;
  localparam [31:0] SRA                = 32'b0100000??????????101?????0110011;
  localparam [31:0] OR                 = 32'b0000000??????????110?????0110011;
  localparam [31:0] AND                = 32'b0000000??????????111?????0110011;
  localparam [31:0] LB                 = 32'b?????????????????000?????0000011;
  localparam [31:0] LH                 = 32'b?????????????????001?????0000011;
  localparam [31:0] LW                 = 32'b?????????????????010?????0000011;
  localparam [31:0] LBU                = 32'b?????????????????100?????0000011;
  localparam [31:0] LHU                = 32'b?????????????????101?????0000011;
  localparam [31:0] SB                 = 32'b?????????????????000?????0100011;
  localparam [31:0] SH                 = 32'b?????????????????001?????0100011;
  localparam [31:0] SW                 = 32'b?????????????????010?????0100011;
  localparam [31:0] FENCE              = 32'b?????????????????000?????0001111;
  localparam [31:0] FENCE_I            = 32'b?????????????????001?????0001111;
  // system
  // NOTE(review): the decoder in this module has no case item for any of the
  // patterns from here on, so these encodings currently end up in its
  // illegal-instruction default branch.
  localparam [31:0] ECALL              = 32'b00000000000000000000000001110011;
  localparam [31:0] EBREAK             = 32'b00000000000100000000000001110011;
  localparam [31:0] URET               = 32'b00000000001000000000000001110011;
  localparam [31:0] SRET               = 32'b00010000001000000000000001110011;
  localparam [31:0] MRET               = 32'b00110000001000000000000001110011;
  localparam [31:0] DRET               = 32'b01111011001000000000000001110011;
  localparam [31:0] SFENCE_VMA         = 32'b0001001??????????000000001110011;
  localparam [31:0] WFI                = 32'b00010000010100000000000001110011;
  localparam [31:0] CSRRW              = 32'b?????????????????001?????1110011;
  localparam [31:0] CSRRS              = 32'b?????????????????010?????1110011;
  localparam [31:0] CSRRC              = 32'b?????????????????011?????1110011;
  localparam [31:0] CSRRWI             = 32'b?????????????????101?????1110011;
  localparam [31:0] CSRRSI             = 32'b?????????????????110?????1110011;
  localparam [31:0] CSRRCI             = 32'b?????????????????111?????1110011;
  localparam [31:0] HFENCE_VVMA        = 32'b0010001??????????000000001110011;
  localparam [31:0] HFENCE_GVMA        = 32'b0110001??????????000000001110011;


  // ---------------------------------------------------------------------
  // datapath signal declarations
  // ---------------------------------------------------------------------

  // program counter: current value, value loaded on commit, and pc + 4
  logic [31:0] pc_q;
  logic [31:0] pc_d;
  logic [31:0] pc_next;
  // fetch status
  logic        insn_ok;
  logic        stall;
  logic        fetch_ok;

  // general purpose registers: the two read ports and the decoder's
  // "this instruction writes rd" flag
  logic [31:0] reg_a;
  logic [31:0] reg_b;
  logic        reg_write_valid;

  // alu operands, result and operation select
  logic [31:0] alu_op_a;
  logic [31:0] alu_op_b;
  logic [31:0] alu_result;
  enum logic [3:0] {
    OP_ADD  = 4'd0,
    OP_SUB  = 4'd1,
    OP_SLL  = 4'd2,
    OP_SEQ  = 4'd3,
    OP_SNE  = 4'd4,
    OP_SLT  = 4'd5,
    OP_SGE  = 4'd6,
    OP_SLTU = 4'd7,
    OP_SGEU = 4'd8,
    OP_XOR  = 4'd9,
    OP_SRL  = 4'd10,
    OP_SRA  = 4'd11,
    OP_OR   = 4'd12,
    OP_AND  = 4'd13
  } alu_op;

  // load/store unit
  logic load_stall;
  logic store_stall;
  logic is_load;
  logic is_store;

  // access size; the encoding is driven out unchanged as the byte strobe
  enum logic [3:0] {
    BYTE     = 4'b0001,
    HALFWORD = 4'b0011,
    WORD     = 4'b1111
  } ls_strobe;

  // sign extension request for sub-word loads
  logic ls_signext;

  // writeback: retire pulse, value written to rd and its source select
  logic        commit;
  logic [31:0] wb_value;
  enum logic [1:0] {
    WB_RET_PC = 2'd0,
    WB_ALU    = 2'd1,
    WB_LOAD   = 2'd2
  } wb_mux;

  // instruction fields
  logic [4:0]  rs1;
  logic [4:0]  rs2;
  logic [4:0]  rd;
  logic [2:0]  funct3;
  logic [6:0]  funct7;
  // immediates, one per instruction format
  logic [31:0] j_imm;
  logic [31:0] u_imm;
  logic [31:0] b_imm;
  logic [31:0] s_imm;
  logic [31:0] i_imm;

  // instruction fetch
  // One request is issued per instruction. While waiting, the returned
  // instruction word is latched so the decoder keeps seeing it after the
  // single-cycle rvalid pulse is gone.
  logic        insn_illegal;
  logic        insn_valid_q;
  logic        insn_valid_d;
  logic [31:0] insn_data;
  logic [31:0] insn_data_q;
  logic [31:0] insn_data_d;

  typedef enum logic [0:0] {
    FETCH = 1'b0,
    WAIT  = 1'b1
  } insn_fetch_e;
  insn_fetch_e insn_fetch_q, insn_fetch_d;

  // TODO: this only fetches every other cycle. Make it faster.
  assign insn_addr_o = pc_q;
  assign fetch_ok    = insn_valid_o & insn_ready_i;

  // Use the word straight from the instruction port in the cycle it arrives
  // (saves one cycle), afterwards fall back to the latched copy.
  assign insn_data = insn_rvalid_i ? insn_data_i : insn_data_q;
  assign insn_ok   = insn_rvalid_i;
  assign stall     = ~insn_rvalid_i | load_stall | store_stall;

  always_comb begin : fetch_next
    // defaults: stay in the current state, keep the latched word, no request
    insn_fetch_d = insn_fetch_q;
    insn_data_d  = insn_data_q;
    insn_valid_d = 1'b0;
    insn_valid_o = 1'b0;

    case (insn_fetch_q)
      WAIT: begin
        // capture the response so it stays stable until commit
        if (insn_rvalid_i) begin
          insn_data_d  = insn_data_i;
          insn_valid_d = 1'b1;
        end
        // the instruction retired: go and request the next one
        if (commit) begin
          insn_fetch_d = FETCH;
        end
      end
      FETCH: begin
        // hold the request until the memory accepts it
        insn_valid_o = 1'b1;
        if (insn_ready_i) begin
          insn_fetch_d = WAIT;
        end
      end
      default: ;
    endcase
  end : fetch_next

  always_ff @(posedge clk_i, negedge rst_ni) begin : fetch
    if (!rst_ni) begin
      insn_data_q  <= 32'b0;
      insn_valid_q <= 1'b0;
      insn_fetch_q <= FETCH;
    end else begin
      insn_data_q  <= insn_data_d;
      insn_valid_q <= insn_valid_d;
      insn_fetch_q <= insn_fetch_d;
    end
  end : fetch

  // program counter update
  logic pc_clear_lsb;
  enum logic [1:0] {
    PC_EXCEPTION = 2'd0,
    PC_NEXT      = 2'd1,
    PC_ALU       = 2'd2,
    PC_BRANCH    = 2'd3
  } pc_mux;

  // sequential successor; also the link value for jumps
  assign pc_next = pc_q + 32'd4;

`ifdef __ICARUS__
  always @(*) begin : program_counter_mux
`else
  always_comb begin : program_counter_mux
`endif
    case (pc_mux)
      PC_NEXT: begin
        pc_d = pc_next;
      end
      PC_BRANCH: begin
        // The alu evaluates the branch condition (bit 0 of its result), so the
        // target has to be computed with a separate adder here.
        pc_d = pc_q + (alu_result[0] ? b_imm : 32'd4);
      end
      PC_ALU: begin
        // jump target from the alu; bit 0 is forced to zero when the decoder
        // asks for it (risc-v spec p.21)
        pc_d = { alu_result[31:1], alu_result[0] & ~pc_clear_lsb };
      end
      PC_EXCEPTION: begin
        pc_d = BOOT_ADDR; // TODO: fix nonsense
      end
      default: begin
        pc_d = pc_q;
      end
    endcase
  end : program_counter_mux

  // the pc only moves when an instruction retires
  always_ff @(posedge clk_i, negedge rst_ni) begin : pc_update
    if (!rst_ni) begin
      pc_q <= BOOT_ADDR;
    end else if (commit) begin
      pc_q <= pc_d;
    end
  end : pc_update

  // selector values shared by both alu operand muxes
  typedef enum logic [2:0] {
    MUX_ZERO  = 3'd0,
    MUX_PC    = 3'd1,
    MUX_REG   = 3'd2,
    MUX_U_IMM = 3'd3,
    MUX_I_IMM = 3'd4,
    MUX_J_IMM = 3'd5,
    MUX_S_IMM = 3'd6
  } op_mux_e;
  op_mux_e op_a_mux, op_b_mux;

  // immediates (risc-v spec p. 17)
  // All formats keep their sign in insn_data[31]; it is replicated into the
  // upper bits to widen each immediate to 32 bit. The U immediate occupies the
  // upper 20 bits and needs no extension.
  assign i_imm = {{20{insn_data[31]}}, insn_data[31:20]};
  assign s_imm = {{20{insn_data[31]}}, insn_data[31:25], insn_data[11:7]};
  assign b_imm = {{20{insn_data[31]}}, insn_data[7], insn_data[30:25], insn_data[11:8], 1'b0};
  assign u_imm = {insn_data[31:12], 12'b0};
  assign j_imm = {{12{insn_data[31]}}, insn_data[19:12], insn_data[20], insn_data[30:21], 1'b0};

  // decoder (1): fixed-position instruction fields
  assign funct7 = insn_data[31:25];
  assign rs2    = insn_data[24:20];
  assign rs1    = insn_data[19:15];
  assign funct3 = insn_data[14:12];
  assign rd     = insn_data[11:7];

// Helper macros for the decoder below. Each one expands to a complete casez
// item that matches the pattern __name and assigns the datapath controls
// passed as arguments. They are undefined again right after the decoder.

// Item for instructions that neither load nor store: operand selects,
// writeback select, next-pc select, alu operation and rd write enable.
`define decode_regular_insn(__name, __op_a_mux, __op_b_mux, __wb_mux, __pc_mux, __alu_op, __reg_write) \
  __name: begin                                                                                        \
    op_a_mux = __op_a_mux; op_b_mux = __op_b_mux;                                                      \
    wb_mux = __wb_mux; pc_mux = __pc_mux;                                                              \
    alu_op = __alu_op;                                                                                 \
    reg_write_valid = __reg_write;                                                                     \
  end

// Item for loads: like the regular item but without a pc select (the decoder
// default applies); additionally raises is_load and sets access size and sign
// extension.
`define decode_load_insn(__name, __op_a_mux, __op_b_mux, __wb_mux, __alu_op, __reg_write, __ls_strobe, __is_signext) \
  __name: begin                                                                                                      \
    op_a_mux = __op_a_mux; op_b_mux = __op_b_mux;                                                                    \
    wb_mux = __wb_mux;                                                                                               \
    alu_op = __alu_op;                                                                                               \
    is_load = 1'b1;                                                                                                  \
    ls_strobe = __ls_strobe;                                                                                         \
    ls_signext = __is_signext;                                                                                       \
    reg_write_valid = __reg_write;                                                                                   \
  end

// Item for stores: operand selects and alu operation for the address, raises
// is_store and sets the access size. rd write stays at the decoder default.
`define decode_store_insn(__name, __op_a_mux, __op_b_mux, __alu_op, __ls_strobe) \
  __name: begin                                                                  \
    op_a_mux = __op_a_mux; op_b_mux = __op_b_mux;                                \
    alu_op = __alu_op;                                                           \
    is_store = 1'b1;                                                             \
    ls_strobe = __ls_strobe;                                                     \
  end

  // decoder (2)
  // Configures the datapath muxes. Really repetitive, hence the macros.
  // Purely combinational: every control gets a default first, then the casez
  // item matching insn_data overrides what that instruction needs.
  always_comb begin : decoder
    // defaults: fall through to pc + 4, no register write, no memory access
    pc_clear_lsb = 1'b0;
    pc_mux   = PC_NEXT;
    wb_mux   = WB_ALU;
    op_a_mux = MUX_ZERO;
    op_b_mux = MUX_ZERO;
    reg_write_valid = 1'b0;
    is_load    = 1'b0;
    is_store   = 1'b0;
    ls_strobe  = WORD;
    ls_signext = 1'b0;

    insn_illegal = 1'b0;
    alu_op = OP_ADD;

    casez (insn_data)
      // branches: the alu evaluates the condition, the pc mux adds the offset
      //                   name  op a     op b     wb mux  pc mux     alu op  rd write
      `decode_regular_insn(BEQ,  MUX_REG, MUX_REG, WB_ALU, PC_BRANCH, OP_SEQ, 1'b0)
      `decode_regular_insn(BNE,  MUX_REG, MUX_REG, WB_ALU, PC_BRANCH, OP_SNE, 1'b0)
      `decode_regular_insn(BLT,  MUX_REG, MUX_REG, WB_ALU, PC_BRANCH, OP_SLT, 1'b0)
      `decode_regular_insn(BGE,  MUX_REG, MUX_REG, WB_ALU, PC_BRANCH, OP_SGE, 1'b0)
      `decode_regular_insn(BLTU, MUX_REG, MUX_REG, WB_ALU, PC_BRANCH, OP_SLTU, 1'b0)
      `decode_regular_insn(BGEU, MUX_REG, MUX_REG, WB_ALU, PC_BRANCH, OP_SGEU, 1'b0)
      // written out by hand because it is the only item that sets pc_clear_lsb
      JALR: begin // rs1 + i_imm, zero lsb of result
        op_a_mux = MUX_REG; op_b_mux = MUX_I_IMM;
        wb_mux = WB_RET_PC; pc_mux = PC_ALU;
        alu_op = OP_ADD;
        reg_write_valid = 1'b1;
        pc_clear_lsb = 1'b1;
      end
      //                   name   op a      op b       wb mux     pc mux   alu op  rd write
      `decode_regular_insn(JAL,   MUX_PC,   MUX_J_IMM, WB_RET_PC, PC_ALU,  OP_ADD, 1'b1)
      `decode_regular_insn(LUI,   MUX_ZERO, MUX_U_IMM, WB_ALU,    PC_NEXT, OP_ADD, 1'b1)
      `decode_regular_insn(AUIPC, MUX_PC,   MUX_U_IMM, WB_ALU,    PC_NEXT, OP_ADD, 1'b1)
      `decode_regular_insn(ADDI,  MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_ADD, 1'b1)
      `decode_regular_insn(SLLI,  MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_SLL, 1'b1)
      `decode_regular_insn(SLTI,  MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_SLT, 1'b1)
      `decode_regular_insn(SLTIU, MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_SLTU, 1'b1)
      `decode_regular_insn(XORI,  MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_XOR, 1'b1)
      `decode_regular_insn(SRLI,  MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_SRL, 1'b1)
      `decode_regular_insn(SRAI,  MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_SRA, 1'b1)
      `decode_regular_insn(ORI,   MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_OR,  1'b1)
      `decode_regular_insn(ANDI,  MUX_REG,  MUX_I_IMM, WB_ALU,    PC_NEXT, OP_AND, 1'b1)
      `decode_regular_insn(ADD,   MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_ADD, 1'b1)
      `decode_regular_insn(SUB,   MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_SUB, 1'b1)
      `decode_regular_insn(SLL,   MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_SLL, 1'b1)
      `decode_regular_insn(SLT,   MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_SLT, 1'b1)
      `decode_regular_insn(SLTU,  MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_SLTU, 1'b1)
      `decode_regular_insn(XOR,   MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_XOR, 1'b1)
      `decode_regular_insn(SRL,   MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_SRL, 1'b1)
      `decode_regular_insn(SRA,   MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_SRA, 1'b1)
      `decode_regular_insn(OR,    MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_OR,  1'b1)
      `decode_regular_insn(AND,   MUX_REG,  MUX_REG,   WB_ALU,    PC_NEXT, OP_AND, 1'b1)

      // loads: the alu computes rs1 + i_imm as the address
      //                name  op a    op b       wb mux    alu op  rdw   strobe    sext
      `decode_load_insn(LB,   MUX_REG, MUX_I_IMM, WB_LOAD, OP_ADD, 1'b1, BYTE,     1'b1)
      `decode_load_insn(LH,   MUX_REG, MUX_I_IMM, WB_LOAD, OP_ADD, 1'b1, HALFWORD, 1'b1)
      `decode_load_insn(LW,   MUX_REG, MUX_I_IMM, WB_LOAD, OP_ADD, 1'b1, WORD,     1'b1)
      `decode_load_insn(LBU,  MUX_REG, MUX_I_IMM, WB_LOAD, OP_ADD, 1'b1, BYTE,     1'b0)
      `decode_load_insn(LHU,  MUX_REG, MUX_I_IMM, WB_LOAD, OP_ADD, 1'b1, HALFWORD, 1'b0)

      // stores: the alu computes rs1 + s_imm as the address
      //                 name  op a     op b       alu op  strobe
      `decode_store_insn(SB,   MUX_REG, MUX_S_IMM, OP_ADD, BYTE)
      `decode_store_insn(SH,   MUX_REG, MUX_S_IMM, OP_ADD, HALFWORD)
      `decode_store_insn(SW,   MUX_REG, MUX_S_IMM, OP_ADD, WORD)

      // fences only advance the pc (no register write, no memory access)
      //                   name     op a      op b      wb mux  pc mux   alu op  rd write
      `decode_regular_insn(FENCE,   MUX_ZERO, MUX_ZERO, WB_ALU, PC_NEXT, OP_ADD, 1'b0)
      `decode_regular_insn(FENCE_I, MUX_ZERO, MUX_ZERO, WB_ALU, PC_NEXT, OP_ADD, 1'b0)

      // Anything else is flagged as illegal. All other controls keep their
      // defaults, i.e. pc + 4 and no register write.
      default: begin
        insn_illegal = 1'b1;
      end
    endcase // casez (insn_data)
  end

`undef decode_regular_insn
`undef decode_load_insn
`undef decode_store_insn

`ifndef SYNTHESIS
`ifndef __ICARUS__
  // Simulation-only check: a fetched instruction must never decode as illegal
  // (points at broken code or a broken core).
  property p_no_illegal_insn;
    @(posedge clk_i) disable iff (!rst_ni)
      !(insn_illegal && insn_ok);
  endproperty

  a_no_illegal_insn: assert property (p_no_illegal_insn)
    else $error("[tb]: illegal instruction decoded value=%x", insn_data);
`endif
`endif

  // alu operand a: zero, the current pc or register read port a

`ifdef __ICARUS__
  always @(*) begin : mux_op_a
`else
  always_comb begin : mux_op_a
`endif
    // zero unless one of the two real sources is selected
    alu_op_a = 32'b0;
    case (op_a_mux)
      MUX_REG: alu_op_a = reg_a;
      MUX_PC:  alu_op_a = pc_q;
      default: ;
    endcase
  end : mux_op_a

`ifdef __ICARUS__
  always @(*) begin : mux_op_b
`else
  always_comb begin : mux_op_b
`endif
    // alu operand b: register read port b or one of the immediates;
    // zero for every other selector value
    alu_op_b = 32'b0;
    case (op_b_mux)
      MUX_REG:   alu_op_b = reg_b;
      MUX_I_IMM: alu_op_b = i_imm;
      MUX_S_IMM: alu_op_b = s_imm;
      MUX_U_IMM: alu_op_b = u_imm;
      MUX_J_IMM: alu_op_b = j_imm;
      default: ;
    endcase
  end : mux_op_b

  // alu
  // Single combinational block; comparison operations return 0 or 1 in bit 0.
`ifdef __ICARUS__
  always @(*) begin : alu
`else
  always_comb begin : alu
`endif
    // TODO: remove this tmp assignment
    alu_result = 32'b0;

    // TODO: share adder for all operations
    // TODO: share shifter
    case (alu_op)
      // arithmetic
      OP_ADD:  alu_result = alu_op_a + alu_op_b;
      OP_SUB:  alu_result = alu_op_a - alu_op_b;

      // bitwise
      OP_AND:  alu_result = alu_op_a & alu_op_b;
      OP_OR:   alu_result = alu_op_a | alu_op_b;
      OP_XOR:  alu_result = alu_op_a ^ alu_op_b;

      // shifts, amount taken from the low five bits of operand b
      OP_SLL:  alu_result = alu_op_a << alu_op_b[4:0];
      OP_SRL:  alu_result = alu_op_a >> alu_op_b[4:0];            // sv lrm 2017 p. 269
      OP_SRA:  alu_result = $signed(alu_op_a) >>> alu_op_b[4:0];  // sv lrm 2017 p. 269 (!)

      // equality
      OP_SEQ:  alu_result = {31'b0, alu_op_a == alu_op_b};
      OP_SNE:  alu_result = {31'b0, alu_op_a != alu_op_b};

      // signed ordering
      OP_SLT:  alu_result = {31'b0, $signed(alu_op_a) <  $signed(alu_op_b)};
      OP_SGE:  alu_result = {31'b0, $signed(alu_op_a) >= $signed(alu_op_b)};

      // unsigned ordering (sv lrm 2017 p. 263)
      OP_SLTU: alu_result = {31'b0, alu_op_a <  alu_op_b};
      OP_SGEU: alu_result = {31'b0, alu_op_a >= alu_op_b};

      default: alu_result = 32'b0; //TODO: better default
    endcase
  end : alu

  // load/store unit: local signals and the request payload

  // value returned by a load after size/sign adjustment
  logic [31:0] load_result;
  // load handshake bookkeeping
  logic        load_valid;
  logic        load_commit;
  // store handshake bookkeeping
  logic        store_valid;
  logic        store_commit;

  // the alu result doubles as the data address
  assign data_addr_o  = alu_result; //TODO: silence for debugging
  // stores always write reg[rs2]
  assign data_wdata_o = reg_b;
  // high for everything that is not a load
  assign data_wen_o   = !is_load;

`ifdef __ICARUS__
  always @(*) begin : load_sext
`else
  always_comb begin : load_sext
`endif
    case (ls_strobe)
      WORD:     load_result = data_rdata_i[31:0];
      HALFWORD: load_result = ls_signext ? $signed(data_rdata_i[15:0]) : data_rdata_i[15:0];
      BYTE:     load_result = ls_signext ? $signed(data_rdata_i[7:0])  : data_rdata_i[7:0];
      default:  load_result = data_rdata_i[31:0];
    endcase // case (ls_strobe)
  end : load_sext
  
  // Data port control: request generation for loads and stores, the load
  // state machine and the retire (commit) signal.

  // the access-size encoding is used directly as the byte strobe
  assign data_strb_o   = ls_strobe;

  // Instruction liveness.
  // is_load/is_store are decoded from insn_data, which keeps showing the
  // latched instruction word after that instruction has committed and until
  // the next fetch response arrives. Using them unqualified re-issues a
  // finished load/store on the data port, and a re-issued store even raises
  // commit a second time and advances the pc again. An instruction is
  // therefore only allowed to drive the data port from the cycle its fetch
  // response arrives until the cycle it commits.
  logic insn_live_q;  // instruction arrived in an earlier cycle, not yet retired
  logic insn_live;
  logic load_req;     // live load
  logic store_req;    // live store

  assign insn_live = insn_rvalid_i | insn_live_q;
  assign load_req  = is_load  & insn_live;
  assign store_req = is_store & insn_live;

  always_ff @(posedge clk_i, negedge rst_ni) begin : insn_live_track
    if (!rst_ni) begin
      insn_live_q <= 1'b0;
    end else if (commit) begin
      // retiring wins over arrival: single-cycle instructions never set it
      insn_live_q <= 1'b0;
    end else if (insn_rvalid_i) begin
      insn_live_q <= 1'b1;
    end
  end : insn_live_track

  // stores are committed as soon as the ready goes high.
  // loads need separate handling since they wait until the response returns
  assign data_valid_o  = load_valid | store_valid;

  // two states only, so a single bit is enough
  typedef enum logic [0:0] {
    ADDRESS, WAIT_RESPONSE
  } load_state_e;
  load_state_e load_state_q, load_state_d;

  // Loads take two phases: the address handshake (ADDRESS) and waiting for
  // the read data (WAIT_RESPONSE). The decoded instruction is held stable
  // meanwhile, and the result is committed to the registers by raising
  // load_commit. This also means the core will always stall on loads (until
  // loads fully complete).
  always_comb begin : handle_load
    load_state_d = load_state_q;
    load_commit  = 1'b0;
    load_valid   = load_req;
    load_stall   = 1'b0;
    case (load_state_q)
      ADDRESS: begin
        // request the access and stall until it has been accepted
        load_stall = load_req;
        load_valid = load_req;
        if (load_req && data_ready_i) begin
          load_state_d = WAIT_RESPONSE;
        end
      end
      WAIT_RESPONSE: begin
        // address phase done: drop the request and wait for the data
        load_stall = 1'b1;
        load_valid = 1'b0;
        if (data_rvalid_i) begin
          load_stall = 1'b0;
          load_commit = 1'b1;
          load_state_d = ADDRESS;
        end
      end
      default:;
    endcase // case (load_state_q)
  end : handle_load

  always_ff @(posedge clk_i, negedge rst_ni) begin : load
    if (!rst_ni) begin
      load_state_q <= ADDRESS;
    end else begin
      load_state_q <= load_state_d;
    end
  end : load

  // a store is done once its request has been accepted
  assign store_valid  = store_req;
  assign store_stall  = store_req & ~data_ready_i;
  assign store_commit = store_req &  data_ready_i;

  // retiring instructions
  // regular instructions need a single cycle to finish
  // for loads we wait until the result returns
  // for stores we wait until the address handshake happened
  assign commit = (insn_ok & ~is_load & ~is_store) | load_commit | store_commit;

  // writeback: select the value that goes into rd
  always_comb begin : writeback_mux
    // unused selector encoding yields zero
    wb_value = 32'b0; //TODO: after debugging replace with alu_result
    case (wb_mux)
      WB_ALU:    wb_value = alu_result;  // arithmetic/logic result
      WB_LOAD:   wb_value = load_result; // formatted load data
      WB_RET_PC: wb_value = pc_next;     // link address of jumps
      default: ;
    endcase
  end : writeback_mux

  // general purpose registers
  // Two combinational read ports and one synchronous write port. x0 reads as
  // zero and is never written.
  logic [31:0] regs [0:31];
  assign reg_a = rs1 != 5'b0 ? regs[rs1] : 32'b0;
  assign reg_b = rs2 != 5'b0 ? regs[rs2] : 32'b0;

  // start from an all-zero register file (there is no reset on the registers)
  integer i;
  initial begin
    for (i = 0; i < 32; i = i + 1)
      regs[i] = 'h0;
  end

  // Plain "always" on purpose: a variable written in an always_ff procedure
  // must not be written by any other process, and the initial block above
  // also writes regs. Tools that enforce this rule reject the always_ff form.
  always @(posedge clk_i) begin : gpr
    // write only when the retiring instruction has a destination other than x0
    if (reg_write_valid && commit && (rd != 5'b0))
      regs[rd] <= wb_value;
  end : gpr

endmodule // catv_riscv
